ex4-NN back propagation

Andrew Ng machine learning exercise ex4: NN back propagation

Data for this exercise

Required imports:

import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import matplotlib
import scipy.optimize as opt
from sklearn.metrics import classification_report  # for the evaluation report

Visualizing the data

Load the data:

def load_data(path, transpose=True):
    data = sio.loadmat(path)
    y = data.get('y')
    y = y.reshape(y.shape[0])
    X = data.get('X')
    if transpose:
        # the raw images are stored column-major; transpose each 20x20 image
        X = np.array([im.reshape((20, 20)).T for im in X])
        X = np.array([im.reshape(400) for im in X])
    return X, y


X, y = load_data('./data/ex4data1.mat')


def plot_100_image(X):
    """Plot 100 randomly chosen images from X."""
    size = int(np.sqrt(X.shape[1]))
    sample_idx = np.random.choice(X.shape[0], 100)  # pick 100 random rows
    sample_images = X[sample_idx, :]

    fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8))

    for r in range(10):
        for c in range(10):
            ax_array[r, c].matshow(sample_images[10 * r + c].reshape((size, size)),
                                   cmap=matplotlib.cm.binary)
            plt.xticks(np.array([]))
            plt.yticks(np.array([]))
    plt.show()


plot_100_image(X)

(figure: 100 randomly sampled digit images)

Preparing the data

Add a bias column of ones to the feature matrix X, and one-hot encode the target vector y.

X_raw, y_raw = load_data('./data/ex4data1.mat', transpose=False)  # no transpose this time
X = np.insert(X_raw, 0, np.ones(X_raw.shape[0]), axis=1)  # prepend a column of ones
print(y_raw.shape)  # (5000,)
y = np.array([y_raw]).T  # column vector, (5000, 1)
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)  # newer scikit-learn versions use sparse_output=False
y_onehot = encoder.fit_transform(y)
print(y_onehot.shape)  # (5000, 10)
y = y_onehot  # use the one-hot labels from here on
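As a quick sanity check of the encoding (the printed values assume the original MATLAB ordering of the data, in which the first samples are the digit 0 labeled as 10):

print(y_raw[0])      # 10  (digit 0 is labeled 10 in the MATLAB data)
print(y_onehot[0])   # [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]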

Loading the weights

First read Theta1 and Theta2 from ex4weights.mat, then unroll (flatten) the two matrices into a single parameter vector.

def load_weight(path):
    data = sio.loadmat(path)
    return data['Theta1'], data['Theta2']


t1, t2 = load_weight('./data/ex4weights.mat')
print(t1.shape, t2.shape)  # (25, 401) (10, 26)


def serialize(a, b):
    # np.ravel() flattens each matrix, np.concatenate() joins the flat vectors
    return np.concatenate((np.ravel(a), np.ravel(b)))


def deserialize(seq):
    # split the flat vector back into the two theta matrices
    return seq[:25 * 401].reshape(25, 401), seq[25 * 401:].reshape(10, 26)


theta = serialize(t1, t2)
print(theta.shape)  # (25 * 401) + (10 * 26) = 10285
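Unrolling is easy to get subtly wrong, so a round-trip check is cheap insurance (a minimal sketch using the matrices just loaded):

t1_back, t2_back = deserialize(serialize(t1, t2))
assert np.allclose(t1, t1_back) and np.allclose(t2, t2_back)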

Feed forward (forward propagation)

(400 + 1) -> (25 + 1) -> (10)
(figure: network architecture)
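In equations, with $g$ the sigmoid, the pass implemented below is:

$$a^{(1)} = x,\quad z^{(2)} = a^{(1)}\Theta_1^T,\quad a^{(2)} = [1;\ g(z^{(2)})],\quad z^{(3)} = a^{(2)}\Theta_2^T,\quad h_\theta(x) = g(z^{(3)})$$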

def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def feed_forward(theta, X):
    t1, t2 = deserialize(theta)
    m = X.shape[0]

    a1 = X  # 5000 x 401
    z2 = a1 @ t1.T  # 5000 x 25
    a2 = np.insert(sigmoid(z2), 0, np.ones(m), axis=1)  # 5000 x 26, bias column prepended
    z3 = a2 @ t2.T  # 5000 x 10
    h = sigmoid(z3)  # 5000 x 10, this is h_theta(X)

    return a1, z2, a2, z3, h  # return every layer's intermediate values

# _, _, _, _, h = feed_forward(theta, X)

# print(h.shape)  # (5000, 10)

Cost function and regularization

The unregularized cost:

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\Bigl[-y_k^{(i)}\log\bigl(h_\theta(x^{(i)})_k\bigr) - (1 - y_k^{(i)})\log\bigl(1 - h_\theta(x^{(i)})_k\bigr)\Bigr]$$

The regularized cost adds a penalty over all non-bias weights:

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\bigl[\cdots\bigr] + \frac{\lambda}{2m}\Bigl[\sum_{j,k}\bigl(\Theta^{(1)}_{j,k}\bigr)^2 + \sum_{j,k}\bigl(\Theta^{(2)}_{j,k}\bigr)^2\Bigr]$$

def cost(theta, X, y):
    m = X.shape[0]
    _, _, _, _, h = feed_forward(theta, X)
    # element-wise cross-entropy over all samples and classes
    pair_computation = -np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))
    return pair_computation.sum() / m


cost_res = cost(theta, X, y)
print("cost:", cost_res)


def regularized_cost(theta, X, y, l=1):
    t1, t2 = deserialize(theta)  # t1: (25, 401), t2: (10, 26)
    m = X.shape[0]

    # skip the first column (bias weights) of both matrices
    reg_t1 = np.power(t1[:, 1:], 2).sum()
    reg_t2 = np.power(t2[:, 1:], 2).sum()
    reg = (l / (2 * m)) * (reg_t1 + reg_t2)  # note: the original code dropped the factor l here

    return cost(theta, X, y) + reg


regularized_cost_res = regularized_cost(theta, X, y)
print("reg cost:", regularized_cost_res)

Back propagation

def sigmoid_gradient(z):
    # g'(z) = g(z) * (1 - g(z))
    return np.multiply(sigmoid(z), 1 - sigmoid(z))


print(sigmoid_gradient(0))  # 0.25


def gradient(theta, X, y):
    t1, t2 = deserialize(theta)
    m = X.shape[0]
    delta1 = np.zeros(t1.shape)  # (25, 401)
    delta2 = np.zeros(t2.shape)  # (10, 26)

    a1, z2, a2, z3, h = feed_forward(theta, X)

    for i in range(m):
        a1i = a1[i, :]  # (401,)
        z2i = z2[i, :]  # (25,)
        a2i = a2[i, :]  # (26,)

        hi = h[i, :]  # (10,)
        yi = y[i, :]  # (10,)

        d3i = hi - yi  # output-layer error

        z2i = np.insert(z2i, 0, np.ones(1))  # add bias so shapes match t2.T
        d2i = np.multiply(t2.T @ d3i, sigmoid_gradient(z2i))  # (26,)

        delta2 += np.matrix(d3i).T @ np.matrix(a2i)  # (10, 26)
        delta1 += np.matrix(d2i[1:]).T @ np.matrix(a1i)  # (25, 401), bias error dropped

    delta1 = delta1 / m
    delta2 = delta2 / m

    return serialize(delta1, delta2)


d1, d2 = deserialize(gradient(theta, X, y))
print(d1.shape, d2.shape)  # (25, 401) (10, 26)
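The per-sample loop mirrors the course notation but is slow in Python. A vectorized version (a sketch, mathematically equivalent to the loop above) computes the same gradients with a few matrix products:

def gradient_vec(theta, X, y):
    t1, t2 = deserialize(theta)
    m = X.shape[0]
    a1, z2, a2, z3, h = feed_forward(theta, X)

    d3 = h - y                                              # (5000, 10)
    d2 = np.multiply(d3 @ t2[:, 1:], sigmoid_gradient(z2))  # (5000, 25), bias column dropped
    delta2 = d3.T @ a2 / m                                  # (10, 26)
    delta1 = d2.T @ a1 / m                                  # (25, 401)
    return serialize(delta1, delta2)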

Gradient checking

$$\frac{\partial J}{\partial \theta_i} \approx \frac{J(\theta + \epsilon\, e_i) - J(\theta - \epsilon\, e_i)}{2\epsilon}$$

Regularized gradient:

$$D^{(l)}_{ij} = \frac{1}{m}\Delta^{(l)}_{ij} + \frac{\lambda}{m}\Theta^{(l)}_{ij} \quad (j \ge 1),\qquad D^{(l)}_{ij} = \frac{1}{m}\Delta^{(l)}_{ij} \quad (j = 0)$$

def regularized_gradient(theta, X, y, l=1):
    """Don't regularize theta of bias terms."""
    m = X.shape[0]
    delta1, delta2 = deserialize(gradient(theta, X, y))
    t1, t2 = deserialize(theta)

    t1[:, 0] = 0  # zero the bias column so it receives no penalty
    reg_term_d1 = (l / m) * t1
    delta1 = delta1 + reg_term_d1

    t2[:, 0] = 0
    reg_term_d2 = (l / m) * t2
    delta2 = delta2 + reg_term_d2

    return serialize(delta1, delta2)


def expand_array(arr):
    """Replicate a vector into a square matrix, one copy per row:
    [1, 2, 3]

    [[1, 2, 3],
     [1, 2, 3],
     [1, 2, 3]]
    """
    # turn matrix back to ndarray
    return np.array(np.matrix(np.ones(arr.shape[0])).T @ np.matrix(arr))


def gradient_checking(theta, X, y, epsilon, regularized=False):
    def a_numeric_grad(plus, minus, regularized=False):
        """calculate a partial gradient with respect to 1 theta"""
        if regularized:
            return (regularized_cost(plus, X, y) - regularized_cost(minus, X, y)) / (epsilon * 2)
        else:
            return (cost(plus, X, y) - cost(minus, X, y)) / (epsilon * 2)

    theta_matrix = expand_array(theta)  # expand to (10285, 10285)
    epsilon_matrix = np.identity(len(theta)) * epsilon

    plus_matrix = theta_matrix + epsilon_matrix
    minus_matrix = theta_matrix - epsilon_matrix

    # calculate the numerical gradient with respect to every theta, one at a time
    numeric_grad = np.array([a_numeric_grad(plus_matrix[i], minus_matrix[i], regularized)
                             for i in range(len(theta))])

    # the analytical gradient depends on whether you want it regularized or not
    analytic_grad = regularized_gradient(theta, X, y) if regularized else gradient(theta, X, y)

    # With a correct implementation and EPSILON = 0.0001,
    # the diff below should be on the order of 1e-9.
    # This is how the original MATLAB code does gradient checking.
    diff = np.linalg.norm(numeric_grad - analytic_grad) / np.linalg.norm(numeric_grad + analytic_grad)

    print('If your backpropagation implementation is correct,\n'
          'the relative difference will be on the order of 1e-9 or smaller (assuming epsilon=0.0001).\n'
          'Relative Difference: {}\n'.format(diff))


# gradient_checking(theta, X, y, epsilon=0.0001)  # very slow; run with caution

If your backpropagation implementation is correct,
the relative difference will be on the order of 1e-9 or smaller (assuming epsilon=0.0001).
Relative Difference: 2.1466000818218673e-09

Training the model

def random_init(size):
    # break symmetry: initialize uniformly in [-0.12, 0.12]
    return np.random.uniform(-0.12, 0.12, size)


def nn_training(X, y):
    init_theta = random_init(10285)  # 25 * 401 + 10 * 26

    res = opt.minimize(fun=regularized_cost,
                       x0=init_theta,
                       args=(X, y, 1),
                       method='TNC',
                       jac=regularized_gradient,
                       options={'maxiter': 400})
    return res


res = nn_training(X, y)  # slow
print(res)
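The ±0.12 range in random_init is the one the exercise recommends: choosing epsilon_init = sqrt(6) / sqrt(L_in + L_out) for the 400-unit input and 25-unit hidden layer gives roughly 0.12, large enough to break symmetry but small enough to keep the sigmoids away from their flat regions.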

Output:

     fun: 0.32211992072588747
     jac: array([ 2.15004329e-04,  3.88985627e-08, -3.33174201e-08, ...,
                  3.15328424e-05,  2.82831419e-05, -1.68082404e-05])
 message: 'Max. number of function evaluations reached'
    nfev: 400
     nit: 26
  status: 3
 success: False
       x: array([ 0.00000000e+00,  1.94492814e-04, -1.66587101e-04, ...,
                 -7.15493763e-01, -1.36561388e+00, -2.90127262e+00])
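Note that success: False here only means the optimizer hit its function-evaluation cap; the cost has already dropped to about 0.32, which is good enough for prediction, as the accuracy report below confirms.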

Showing the accuracy

_, y_answer = load_data('./data/ex4data1.mat')  # original labels 1..10

final_theta = res.x


def show_accuracy(theta, X, y):
    _, _, _, _, h = feed_forward(theta, X)

    # argmax over the 10 output units, +1 to map back to labels 1..10
    y_pred = np.argmax(h, axis=1) + 1

    print(classification_report(y, y_pred))


show_accuracy(final_theta, X, y_answer)

Output:

             precision    recall  f1-score   support

          1       1.00      0.79      0.88       500
          2       0.73      1.00      0.85       500
          3       0.82      0.99      0.89       500
          4       1.00      0.89      0.94       500
          5       1.00      0.86      0.92       500
          6       0.94      0.99      0.97       500
          7       0.99      0.81      0.89       500
          8       0.94      0.95      0.95       500
          9       0.96      0.95      0.95       500
         10       0.96      0.98      0.97       500

avg / total       0.93      0.92      0.92      5000

Visualizing the hidden layer
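Each of the 25 hidden units corresponds to one row of Theta1; dropping the bias weight and reshaping the remaining 400 weights to 20 x 20 shows the input pattern that unit responds to most strongly.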

def plot_hidden_layer(theta):
    """
    theta: (10285, )
    """
    final_theta1, _ = deserialize(theta)
    hidden_layer = final_theta1[:, 1:]  # get rid of the bias term theta

    fig, ax_array = plt.subplots(nrows=5, ncols=5, sharey=True, sharex=True, figsize=(5, 5))

    for r in range(5):
        for c in range(5):
            ax_array[r, c].matshow(hidden_layer[5 * r + c].reshape((20, 20)),
                                   cmap=matplotlib.cm.binary)
            plt.xticks(np.array([]))
            plt.yticks(np.array([]))


plot_hidden_layer(final_theta)
plt.show()

(figure: the 25 hidden-unit weight images)
